import json
import pickle
import pandas as pd
from pathlib import Path
from config import Config

def load_raw_data(file_path):
    """Load the raw chat log from a JSON file.

    Args:
        file_path: Location of the JSON file; it is read as UTF-8 text.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, encoding='utf-8') as source:
        return json.load(source)

def preprocess_data(raw_data):
    """Turn raw conversations into a DataFrame of question/answer pairs.

    Every pair of adjacent messages in a conversation becomes one row: the
    earlier message is the ``question`` and the message right after it is
    the ``answer``. A conversation with N messages therefore contributes
    N - 1 rows, and a conversation with fewer than two messages contributes
    none.

    Args:
        raw_data: Iterable of conversations; each conversation is a sequence
            of messages that supports slicing (e.g. a list).

    Returns:
        A ``pandas.DataFrame`` with the columns ``question`` and ``answer``.
        Both columns are present even when no pairs were produced, so
        callers can select them without special-casing empty input.
    """
    # Assumes each message is answered by the one that directly follows it.
    pairs = [
        {'question': question, 'answer': answer}
        for conversation in raw_data
        for question, answer in zip(conversation, conversation[1:])
    ]
    # Naming the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(pairs, columns=['question', 'answer'])

def save_processed_data(df, file_path):
    """Pickle the processed data to disk.

    Args:
        df: Object to serialize (the processed data).
        file_path: Destination file. Missing parent directories are created
            first; an existing file is overwritten.
    """
    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('wb') as sink:
        pickle.dump(df, sink)

def load_processed_data(file_path):
    """Load previously pickled processed data from disk.

    Args:
        file_path: Location of the pickle file to read.

    Returns:
        The object stored in the pickle file.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        from a trusted source.
    """
    with open(file_path, 'rb') as source:
        restored = pickle.load(source)
    return restored

def prepare_data():
    """Run the data preparation pipeline.

    Loads the raw data from ``Config.RAW_DATA_PATH``, preprocesses it, and
    saves the result to ``Config.PROCESSED_DATA_PATH``.

    Returns:
        The preprocessed data that was saved.
    """
    frame = preprocess_data(load_raw_data(Config.RAW_DATA_PATH))
    save_processed_data(frame, Config.PROCESSED_DATA_PATH)
    return frame